# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
from sina.items import *


class NewsSpider(scrapy.Spider):
    """Spider that collects link/title pairs from the Sina news front page.

    Every ``<a>`` element that is a direct child of an ``<li>`` element
    produces one ``SinaItem`` carrying the anchor's ``href`` as ``link``
    and, when the anchor has a text node, its first text node as ``title``.
    """

    name = 'news'
    # Entries must be bare domain names. The previous value had a trailing
    # slash ('news.sina.com.cn/'), which is not a valid domain and can never
    # match the hostname of a request.
    allowed_domains = ['news.sina.com.cn']
    start_urls = ['http://news.sina.com.cn/']

    def parse(self, response):
        """Parse the page and yield one item per list-entry anchor.

        Args:
            response: The downloaded page; only ``response.text`` is read.

        Yields:
            SinaItem: ``link`` is always set. ``title`` is set only when the
            anchor has at least one direct text node.
        """
        etree_html = etree.HTML(response.text)
        for anchor in etree_html.xpath('//li/a'):
            hrefs = anchor.xpath('./@href')
            if not hrefs:
                # An anchor without an href (e.g. a named anchor) has no link
                # to record. Indexing [0] here used to raise IndexError and
                # abort the generator, losing every later item on the page.
                continue

            item = SinaItem()
            item['link'] = hrefs[0]

            titles = anchor.xpath('./text()')
            if titles:
                item['title'] = titles[0]
            yield item
